library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(sjPlot)
library(ggsci)
theme_set(
theme_classic(18) +
theme(legend.position = "bottom",
panel.grid.minor = element_blank()))
clean_over_balls = function(x){
x %>%
tidyr::separate(
col = "ball",
into = c("over", "over_ball"),
sep = "[.]", remove = FALSE) %>%
dplyr::mutate(
ball = as_factor(ball),
over = over %>% as.integer,
over_ball = over_ball %>% as.integer(),
wicket = ifelse(wicket_out %>% complete.cases, "yes", "no") %>% factor())
}
raw_tbl = readRDS("data/RData/ball_by_ball_22_Aug_2020.rds")
test1 = raw_tbl %>% slice(1) %>% unnest(match_tbl) %>% unnest(innings_tbl) %>%
clean_over_balls
test1 %>% glimpse()
## Rows: 2,499
## Columns: 27
## $ filename <chr> "data/ball_by_ball_raw//1000851.yaml", "data/ball_by_b…
## $ venue <chr> "Western Australia Cricket Association Ground", "Weste…
## $ team1 <chr> "Australia", "Australia", "Australia", "Australia", "A…
## $ team2 <chr> "South Africa", "South Africa", "South Africa", "South…
## $ date <chr> "2016-11-03--2016-11-04--2016-11-05--2016-11-06--2016-…
## $ gender <chr> "male", "male", "male", "male", "male", "male", "male"…
## $ toss_winner <chr> "South Africa", "South Africa", "South Africa", "South…
## $ toss_decision <chr> "bat", "bat", "bat", "bat", "bat", "bat", "bat", "bat"…
## $ match_type <chr> "Test", "Test", "Test", "Test", "Test", "Test", "Test"…
## $ winning_team <chr> "South Africa", "South Africa", "South Africa", "South…
## $ winning_type <chr> "runs", "runs", "runs", "runs", "runs", "runs", "runs"…
## $ winning_margin <chr> "177", "177", "177", "177", "177", "177", "177", "177"…
## $ innings <chr> "1st innings", "1st innings", "1st innings", "1st inni…
## $ batsman <chr> "SC Cook", "SC Cook", "SC Cook", "SC Cook", "HM Amla",…
## $ bowler <chr> "MA Starc", "MA Starc", "MA Starc", "MA Starc", "MA St…
## $ non_striker <chr> "D Elgar", "D Elgar", "D Elgar", "D Elgar", "D Elgar",…
## $ runs_batsman <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 4, 0, …
## $ runs_extras <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ runs_total <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 4, 0, …
## $ ball <fct> 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 1.1, 1.2, 1.3, 1.4, 1.5,…
## $ over <int> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, …
## $ over_ball <int> 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, …
## $ wicket_kind <chr> NA, NA, NA, "caught", NA, NA, NA, NA, NA, NA, NA, NA, …
## $ wicket_out <chr> NA, NA, NA, "SC Cook", NA, NA, NA, NA, NA, NA, NA, NA,…
## $ extras_kind <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ batting_team <chr> "South Africa", "South Africa", "South Africa", "South…
## $ wicket <fct> no, no, no, yes, no, no, no, no, no, no, no, no, no, n…
test1 %>%
ggplot(aes(x = over, y = runs_batsman,
colour = wicket,
alpha = wicket,
size = wicket)) +
geom_point() +
scale_y_continuous(breaks = 0:6) +
scale_colour_brewer(palette = "Set1") +
scale_alpha_manual(values = c("yes" = 1, "no" = 0.05)) +
scale_size_manual(values = c("yes" = 3, "no" = 1)) +
facet_wrap(~innings, scales = "free_x")
lyon = test1 %>%
dplyr::filter(bowler == "NM Lyon")
lyon %>%
ggplot(aes(x = over, y = runs_batsman,
colour = wicket,
alpha = wicket,
size = wicket)) +
geom_point() +
scale_y_continuous(breaks = 0:6) +
scale_colour_brewer(palette = "Set1") +
scale_alpha_manual(values = c("yes" = 1, "no" = 0.5)) +
scale_size_manual(values = c("yes" = 3, "no" = 1)) +
facet_wrap(~innings, scales = "free_x")
lyon %>%
ggplot(aes(x = over_ball, y = runs_batsman,
colour = wicket,
alpha = wicket,
size = wicket)) +
geom_point() +
scale_y_continuous(breaks = 0:6) +
scale_colour_brewer(palette = "Set1") +
scale_alpha_manual(values = c("yes" = 1, "no" = 0.5)) +
scale_size_manual(values = c("yes" = 3, "no" = 1)) +
facet_wrap(~innings, scales = "free_x")
Maybe use a series of piecharts.
test1 %>%
ggplot(aes(x = over_ball, y = runs_batsman,
colour = wicket,
alpha = wicket,
size = wicket)) +
geom_point() +
scale_y_continuous(breaks = 0:6) +
scale_colour_brewer(palette = "Set1") +
scale_alpha_manual(values = c("yes" = 1, "no" = 0.5)) +
scale_size_manual(values = c("yes" = 3, "no" = 1)) +
facet_wrap(~innings, scales = "free_x")
M1 = glm(wicket ~ over + factor(over_ball), family = "binomial", data = test1 %>% dplyr::filter(over_ball <= 6))
summary(M1)
##
## Call:
## glm(formula = wicket ~ over + factor(over_ball), family = "binomial",
## data = test1 %>% dplyr::filter(over_ball <= 6))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2079 -0.1939 -0.1840 -0.1549 3.1546
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.993581 0.429916 -9.289 <2e-16 ***
## over 0.001033 0.003950 0.262 0.794
## factor(over_ball)2 -0.987762 0.680564 -1.451 0.147
## factor(over_ball)3 0.005329 0.504902 0.011 0.992
## factor(over_ball)4 -0.128180 0.522286 -0.245 0.806
## factor(over_ball)5 -0.467112 0.574386 -0.813 0.416
## factor(over_ball)6 -0.125704 0.522292 -0.241 0.810
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.01 on 2480 degrees of freedom
## Residual deviance: 389.48 on 2474 degrees of freedom
## AIC: 403.48
##
## Number of Fisher Scoring iterations: 7
plot_model(M1, type = "pred", digits = 4)
## Data were 'prettified'. Consider using `terms="over [all]"` to get smooth plots.
## $over
##
## $over_ball
all_tests = raw_tbl %>% unnest(match_tbl) %>%
dplyr::mutate(total_balls_match = purrr::map_int(innings_tbl, nrow)) %>%
unnest(innings_tbl) %>%
clean_over_balls() %>%
dplyr::filter(over_ball <= 6)
M2 = glm(wicket ~ over + factor(over_ball), family = "binomial", data = all_tests)
summary(M2)
##
## Call:
## glm(formula = wicket ~ over + factor(over_ball), family = "binomial",
## data = all_tests)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.2206 -0.1847 -0.1792 -0.1749 2.9300
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.2787209 0.0224222 -190.825 < 2e-16 ***
## over 0.0019175 0.0001977 9.698 < 2e-16 ***
## factor(over_ball)2 0.0659485 0.0272061 2.424 0.015349 *
## factor(over_ball)3 0.0906678 0.0270606 3.351 0.000807 ***
## factor(over_ball)4 0.0640932 0.0272409 2.353 0.018631 *
## factor(over_ball)5 0.0734183 0.0271930 2.700 0.006936 **
## factor(over_ball)6 0.1016391 0.0270284 3.760 0.000170 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 173682 on 1049857 degrees of freedom
## Residual deviance: 173573 on 1049851 degrees of freedom
## AIC: 173587
##
## Number of Fisher Scoring iterations: 7
plot_model(M2, type = "pred", digits = 10)
## Data were 'prettified'. Consider using `terms="over [all]"` to get smooth plots.
## $over
##
## $over_ball
# qqnorm(M$residuals)
# qqline(M$residuals)
library(lme4)
M3 = glmer(wicket ~ over + factor(over_ball) + (1 | venue),
family = "binomial", data = all_tests)
plot_model(M3, type = "pred", digits = 4)
The hypothesis is that prior to taking wickets, batsmen are typically strangled to taking runs.
The following visualise the runs before taking wicket.
# cum_balls_before_wkt = purrr::map(.x = which(all_tests$wicket == "yes"), .f = ~ seq(.x, 0, by = -1))
where_wkts = which(all_tests$wicket == "yes")
cum_balls_before_wkt = rep(NA, length(all_tests$wicket))
last_wkt = 0
for(wkt in where_wkts){
cum_balls_before_wkt[(last_wkt + 1L):wkt] = seq(wkt - last_wkt - 1L, 0L, by = -1)
last_wkt = wkt
}
pressure = all_tests %>%
dplyr::mutate(
cum_all_wkt = cumsum(wicket == "yes"),
cum_balls_before_wkt = cum_balls_before_wkt) %>%
group_by(filename, batting_team, innings) %>%
dplyr::mutate(
cum_inns_wkt = cumsum(wicket == "yes")) %>%
ungroup()
# View(pressure %>% slice(1:100))
pressure %>%
dplyr::filter(filename == "data/ball_by_ball_raw//1000851.yaml") %>%
ggplot(aes(x = cum_balls_before_wkt, y = runs_total)) +
geom_point(alpha = 0.5) +
scale_y_continuous(breaks = 0:6) +
scale_x_continuous(trans = "log1p", breaks = c(0:18))
pressure_perc = pressure %>%
dplyr::filter(cum_balls_before_wkt <= 18) %>%
group_by(cum_balls_before_wkt, runs_total) %>%
dplyr::tally() %>%
dplyr::mutate(perc = n/sum(n)) %>%
ungroup()
pressure_perc %>%
ggplot(aes(x = cum_balls_before_wkt,
y = perc)) +
geom_col(aes(fill = factor(runs_total))) +
scale_fill_d3()
pressure_perc %>%
ggplot(aes(x = cum_balls_before_wkt,
y = perc, colour = factor(runs_total))) +
geom_line(size = 1.5) +
scale_colour_d3() +
scale_x_continuous(breaks = c(1:10))
bowler_pressure = pressure %>%
group_by(filename) %>%
dplyr::mutate(wkt_bowler = ifelse(wicket == "yes", bowler, NA)) %>%
tidyr::fill(wkt_bowler, .direction = "down") %>%
ungroup() %>%
group_by(wkt_bowler) %>%
nest() %>%
dplyr::mutate(total_balls = purrr::map_int(data, nrow))
bowler_pressure_perc = bowler_pressure %>%
dplyr::filter(complete.cases(wkt_bowler)) %>%
ungroup() %>%
# dplyr::filter(rank(desc(total_balls)) <= 50) %>%
dplyr::filter(rank(desc(total_balls)) <= 11 | wkt_bowler == "SR Watson") %>%
# dplyr::filter(rank(desc(total_balls)) <= 11 | wkt_bowler == "") %>%
# dplyr::filter(wkt_bowler %in% c("JM Anderson", "NM Lyon")) %>%
# dplyr::filter(rank(desc(total_balls)) <= 10) %>%
unnest(data) %>%
dplyr::filter(cum_balls_before_wkt <= 12, cum_balls_before_wkt > 0) %>%
group_by(wkt_bowler, cum_balls_before_wkt, runs_total) %>%
dplyr::tally() %>%
dplyr::mutate(perc = n/sum(n)) %>%
dplyr::filter(perc < 1) %>%
ungroup()
bowler_pressure_perc %>%
ggplot(aes(x = cum_balls_before_wkt,
y = perc,
colour = factor(runs_total),
group = interaction(runs_total, wkt_bowler)
)) +
geom_line(size = 0.5) +
scale_colour_d3() +
scale_x_continuous(breaks = c(0:12)) +
scale_y_continuous(breaks = seq(0, 1, by = 0.25)) +
labs(x = "Num of runs before taking wicket",
y = "Percentage averaged across 12 balls",
colour = "Runs")
bowler_pressure_perc %>%
ggplot(aes(x = cum_balls_before_wkt,
y = perc)) +
geom_col(aes(fill = factor(runs_total))) +
geom_hline(yintercept = 0.25) +
# geom_hline(yintercept = 0.12) +
scale_fill_d3() +
scale_x_continuous(breaks = c(0:12)) +
facet_wrap(~wkt_bowler) +
labs(x = "Num of runs before taking wicket",
y = "Percentage averaged across 12 balls",
fill = "Runs")
pressure %>%
group_by(cum_all_wkt) %>%
summarise(
wkt_taking_bowler = mean(unique(bowler) == last(bowler))) %>%
ggplot(aes(x = wkt_taking_bowler)) +
geom_histogram()
## `summarise()` ungrouping output (override with `.groups` argument)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# M3 = glm(wicket ~ factor(over_ball) * cum_balls_before_wkt, family = "binomial", data = pressure)
#
# summary(M3)
sessioninfo::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 3.6.2 (2019-12-12)
## os macOS Mojave 10.14.6
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_AU.UTF-8
## ctype en_AU.UTF-8
## tz Australia/Melbourne
## date 2020-08-29
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date lib source
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.0)
## backports 1.1.8 2020-06-17 [1] CRAN (R 3.6.2)
## bayestestR 0.7.0 2020-06-19 [1] CRAN (R 3.6.2)
## blob 1.2.1 2020-01-20 [1] CRAN (R 3.6.0)
## boot 1.3-25 2020-04-26 [1] CRAN (R 3.6.2)
## broom 0.7.0 2020-07-09 [1] CRAN (R 3.6.2)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 3.6.0)
## cli 2.0.2 2020-02-28 [1] CRAN (R 3.6.0)
## coda 0.19-3 2019-07-05 [1] CRAN (R 3.6.0)
## codetools 0.2-16 2018-12-24 [1] CRAN (R 3.6.2)
## colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.0)
## crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.0)
## DBI 1.1.0 2019-12-15 [1] CRAN (R 3.6.0)
## dbplyr 1.4.4 2020-05-27 [1] CRAN (R 3.6.2)
## digest 0.6.25 2020-02-23 [1] CRAN (R 3.6.0)
## dplyr * 1.0.0 2020-05-29 [1] CRAN (R 3.6.2)
## effectsize 0.3.1 2020-05-19 [1] CRAN (R 3.6.2)
## ellipsis 0.3.1 2020-05-15 [1] CRAN (R 3.6.2)
## emmeans 1.4.8 2020-06-26 [1] CRAN (R 3.6.2)
## estimability 1.3 2018-02-11 [1] CRAN (R 3.6.0)
## evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.0)
## fansi 0.4.1 2020-01-08 [1] CRAN (R 3.6.0)
## farver 2.0.3 2020-01-16 [1] CRAN (R 3.6.0)
## forcats * 0.5.0 2020-03-01 [1] CRAN (R 3.6.2)
## fs 1.4.2 2020-06-30 [1] CRAN (R 3.6.2)
## generics 0.0.2 2018-11-29 [1] CRAN (R 3.6.0)
## ggeffects 0.15.0 2020-06-16 [1] CRAN (R 3.6.2)
## ggplot2 * 3.3.2 2020-06-19 [1] CRAN (R 3.6.2)
## ggsci * 2.9 2018-05-14 [1] CRAN (R 3.6.0)
## glue 1.4.1 2020-05-13 [1] CRAN (R 3.6.2)
## gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.0)
## haven 2.3.1 2020-06-01 [1] CRAN (R 3.6.2)
## hms 0.5.3 2020-01-08 [1] CRAN (R 3.6.0)
## htmltools 0.5.0 2020-06-16 [1] CRAN (R 3.6.2)
## httr 1.4.1 2019-08-05 [1] CRAN (R 3.6.0)
## insight 0.8.5 2020-06-08 [1] CRAN (R 3.6.2)
## jsonlite 1.7.0 2020-06-25 [1] CRAN (R 3.6.2)
## knitr 1.29 2020-06-23 [1] CRAN (R 3.6.2)
## labeling 0.3 2014-08-23 [1] CRAN (R 3.6.0)
## lattice 0.20-41 2020-04-02 [1] CRAN (R 3.6.2)
## lifecycle 0.2.0 2020-03-06 [1] CRAN (R 3.6.0)
## lme4 1.1-23 2020-04-07 [1] CRAN (R 3.6.2)
## lubridate 1.7.9 2020-06-08 [1] CRAN (R 3.6.2)
## magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.0)
## MASS 7.3-51.6 2020-04-26 [1] CRAN (R 3.6.2)
## Matrix 1.2-18 2019-11-27 [1] CRAN (R 3.6.2)
## minqa 1.2.4 2014-10-09 [1] CRAN (R 3.6.0)
## modelr 0.1.8 2020-05-19 [1] CRAN (R 3.6.2)
## multcomp 1.4-13 2020-04-08 [1] CRAN (R 3.6.2)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.0)
## mvtnorm 1.1-1 2020-06-09 [1] CRAN (R 3.6.2)
## nlme 3.1-148 2020-05-24 [1] CRAN (R 3.6.2)
## nloptr 1.2.2.2 2020-07-02 [1] CRAN (R 3.6.2)
## parameters 0.8.0 2020-06-08 [1] CRAN (R 3.6.2)
## performance 0.4.7 2020-06-14 [1] CRAN (R 3.6.2)
## pillar 1.4.6 2020-07-10 [1] CRAN (R 3.6.2)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.6.0)
## purrr * 0.3.4 2020-04-17 [1] CRAN (R 3.6.2)
## R6 2.4.1 2019-11-12 [1] CRAN (R 3.6.0)
## RColorBrewer 1.1-2 2014-12-07 [1] CRAN (R 3.6.0)
## Rcpp 1.0.5 2020-07-06 [1] CRAN (R 3.6.2)
## readr * 1.3.1 2018-12-21 [1] CRAN (R 3.6.0)
## readxl 1.3.1 2019-03-13 [1] CRAN (R 3.6.0)
## reprex 0.3.0 2019-05-16 [1] CRAN (R 3.6.0)
## rlang 0.4.7 2020-07-09 [1] CRAN (R 3.6.2)
## rmarkdown 2.3.1 2020-06-23 [1] Github (rstudio/rmarkdown@b53a85a)
## rstudioapi 0.11 2020-02-07 [1] CRAN (R 3.6.0)
## rvest 0.3.5 2019-11-08 [1] CRAN (R 3.6.0)
## sandwich 2.5-1 2019-04-06 [1] CRAN (R 3.6.0)
## scales 1.1.1 2020-05-11 [1] CRAN (R 3.6.2)
## sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.0)
## sjlabelled 1.1.6 2020-06-25 [1] CRAN (R 3.6.2)
## sjmisc 2.8.5 2020-05-28 [1] CRAN (R 3.6.2)
## sjPlot * 2.8.4 2020-05-24 [1] CRAN (R 3.6.2)
## sjstats 0.18.0 2020-05-06 [1] CRAN (R 3.6.2)
## snakecase 0.11.0 2019-05-25 [1] CRAN (R 3.6.0)
## statmod 1.4.34 2020-02-17 [1] CRAN (R 3.6.1)
## stringi 1.4.6 2020-02-17 [1] CRAN (R 3.6.1)
## stringr * 1.4.0 2019-02-10 [1] CRAN (R 3.6.0)
## survival 3.2-3 2020-06-13 [1] CRAN (R 3.6.2)
## TH.data 1.0-10 2019-01-21 [1] CRAN (R 3.6.0)
## tibble * 3.0.3 2020-07-10 [1] CRAN (R 3.6.2)
## tidyr * 1.1.0 2020-05-20 [1] CRAN (R 3.6.2)
## tidyselect 1.1.0 2020-05-11 [1] CRAN (R 3.6.2)
## tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 3.6.0)
## utf8 1.1.4 2018-05-24 [1] CRAN (R 3.6.0)
## vctrs 0.3.1 2020-06-05 [1] CRAN (R 3.6.2)
## withr 2.2.0 2020-04-20 [1] CRAN (R 3.6.2)
## xfun 0.15 2020-06-21 [1] CRAN (R 3.6.2)
## xml2 1.3.2 2020-04-23 [1] CRAN (R 3.6.2)
## xtable 1.8-4 2019-04-21 [1] CRAN (R 3.6.0)
## yaml 2.2.1 2020-02-01 [1] CRAN (R 3.6.1)
## zoo 1.8-8 2020-05-02 [1] CRAN (R 3.6.2)
##
## [1] /Library/Frameworks/R.framework/Versions/3.6/Resources/library